!git clone https://github.com/microsoft/computervision-recipes.git
%cd computervision-recipes
!pip install decord ipywebrtc einops
%cd /content/video_input/
/content/video_input
from pytube import YouTube
# YouTube('https://www.youtube.com/watch?v=9P7JzTRHz5g').streams.first().download()
# YouTube('https://www.youtube.com/watch?v=0Cl_Q8RjmfI').streams.first().download()
# YouTube('https://www.youtube.com/watch?v=k2eCJ2XI1IA').streams.first().download()
# YouTube('https://www.youtube.com/watch?v=6mhRTDBNQ-M').streams.first().download()
# YouTube('https://www.youtube.com/watch?v=xm9c5HAUBpY').streams.first().download()
# YouTube('https://www.youtube.com/watch?v=K1FPxvdB_to').streams.first().download()
import sys
from collections import deque #
import io
import requests
import os
from time import sleep, time
from threading import Thread
from IPython.display import HTML
from base64 import b64encode

# Third party tools
import decord #
import IPython.display #
from ipywebrtc import CameraStream, ImageRecorder
from ipywidgets import HBox, HTML, Layout, VBox, Widget, Label
import numpy as np
from PIL import Image
import torch
import torch.cuda as cuda
import torch.nn as nn
from torchvision.transforms import Compose

# utils_cv
sys.path.append("/content/computervision-recipes")
from utils_cv.action_recognition.data import KINETICS, Urls
from utils_cv.action_recognition.dataset import get_transforms
from utils_cv.action_recognition.model import VideoLearner
from utils_cv.action_recognition.references import transforms_video as transforms
from utils_cv.common.gpu import system_info, torch_device
from utils_cv.common.data import data_path

%reload_ext autoreload
%autoreload 2

system_info()
3.6.9 (default, Oct  8 2020, 12:12:24) 
[GCC 8.4.0] 

PyTorch 1.7.0+cu101 

Torch-vision 0.8.1+cu101 

Available devices:
CPUs only, no GPUs found
# --- Inference configuration ---
NUM_FRAMES = 8  # frames per clip fed to the model; 8 or 32.
IM_SCALE = 128  # resize frames to this scale, then crop
INPUT_SIZE = 112  # input clip size: 3 x NUM_FRAMES x 112 x 112

# video sample to download
sample_video_url = Urls.webcam_vid

# file path to save video sample
video_fpath = data_path() / "sample_video.mp4"

# prediction score threshold: labels scoring below this are not shown
SCORE_THRESHOLD = 0.01

# Averaging 5 latest clips to make video-level prediction (or smoothing)
AVERAGING_SIZE = 5  
learner = VideoLearner(base_model="kinetics", sample_length=NUM_FRAMES)
Loading r2plus1d_34_8_kinetics model
Using cache found in /root/.cache/torch/hub/moabitcoin_ig65m-pytorch_master
# Full list of Kinetics class names; display the first 10 as a sanity check.
LABELS = KINETICS.class_names
LABELS[:10]
['abseiling',
 'air drumming',
 'answering questions',
 'applauding',
 'applying cream',
 'archery',
 'arm wrestling',
 'arranging flowers',
 'assembling computer',
 'auctioning']
# Subset of Kinetics class names we care about; passed as `target_labels`
# to the predictors below so only these actions are reported.
TARGET_LABELS = [
    "assembling computer",
    "applying cream",
    "brushing teeth",
    "clapping",
    "cleaning floor",
    "cleaning windows",
    "drinking",
    "eating burger",
    "eating chips",
    "eating doughnuts",
    "eating hotdog",
    "eating ice cream",
    "fixing hair",
    "hammer throw",
    "high kick",
    "jogging",
    "laughing",
    "mopping floor",
    "moving furniture",
    "opening bottle",
    "plastering",
    "punching bag",
    "punching person (boxing)",
    "pushing cart",
    "reading book",
    "reading newspaper",
    "rock scissors paper",
    "running on treadmill",
    "shaking hands",
    "shaking head",
    "side kick",
    "slapping",
    "smoking",
    "sneezing",
    "spray painting",
    "spraying",
    "stretching arm",
    "stretching leg",
    "sweeping floor",
    "swinging legs",
    "texting",
    "throwing axe",
    "throwing ball",
    "unboxing",
    "unloading truck",
    "using computer",
    "using remote controller (not gaming)",
    "welding",
    "writing",
    "yawning",
]
# Sanity check: how many target labels are selected.
len(TARGET_LABELS)
50
# (Optional) inline preview of a local mp4 in the notebook:
# path = 'video.mp4'
# mp4 = open(path,'rb').read()
# data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
# HTML("""<video width=400 controls><source src="%s" type="video/mp4"></video>""" % data_url)

# Download the sample video and save it to video_fpath.
r = requests.get(sample_video_url)
r.raise_for_status()  # fail loudly on HTTP errors instead of saving an error page
with open(video_fpath, "wb") as f:  # `with` guarantees the file handle is closed
    f.write(r.content)
12609830
# Score the downloaded sample video: predictions are smoothed over
# AVERAGING_SIZE clips, filtered to TARGET_LABELS, and thresholded
# at SCORE_THRESHOLD.
video = str(data_path()/"sample_video.mp4")
learner.predict_video(
    video,
    LABELS,
    averaging_size=AVERAGING_SIZE,
    score_threshold=SCORE_THRESHOLD,
    target_labels=TARGET_LABELS,
)
# Webcam stream widget: front-facing camera, 400x400 video, no audio.
w_cam = CameraStream(
    constraints={
        "facing_mode": "user",
        "audio": False,
        "video": {"width": 400, "height": 400},
    },
    layout=Layout(width="400px"),
)

# Image recorder for taking a snapshot (JPEG frames pulled from w_cam)
w_imrecorder = ImageRecorder(
    format="jpg", stream=w_cam, layout=Layout(padding="0 0 0 100px")
)

# Text widget to show our classification results
w_text = HTML(layout=Layout(padding="0 0 0 100px"))
def predict_webcam_frames():
    """Continuously classify webcam snapshots with the pretrained model.

    Intended to run on a background thread while the global ``is_playing``
    flag is True. Snapshots from ``w_imrecorder`` are accumulated in a
    sliding window; once NUM_FRAMES frames are collected,
    ``learner.predict_frames`` scores the clip (smoothed over
    AVERAGING_SIZE clips, thresholded at SCORE_THRESHOLD) and publishes
    the result text to ``w_text``.
    """
    global w_imrecorder, w_text, is_playing
    global device, model

    # Use deque for sliding window over frames
    window = deque()
    scores_cache = deque()
    scores_sum = np.zeros(len(LABELS))

    # Loop-invariant work hoisted out of the capture loop:
    # build the evaluation transform pipeline once instead of per prediction.
    eval_transforms = get_transforms(train=False)

    def update_println(println):
        # Callback used by predict_frames to publish the result text.
        w_text.value = println

    while is_playing:
        try:
            # Get the image (RGBA) and convert to RGB
            im = Image.open(io.BytesIO(w_imrecorder.image.value)).convert("RGB")
            window.append(np.array(im))

            if len(window) == NUM_FRAMES:
                learner.predict_frames(
                    window,
                    scores_cache,
                    scores_sum,
                    None,
                    AVERAGING_SIZE,
                    SCORE_THRESHOLD,
                    LABELS,
                    TARGET_LABELS,
                    eval_transforms,
                    update_println,
                )
            else:
                w_text.value = "Preparing..."
        except OSError:
            # If im_recorder doesn't have valid image data, skip it.
            pass
        except BaseException as e:
            # Deliberately broad: surface any failure in the widget, then stop.
            w_text.value = "Exception: " + str(e)
            break

        # Taking the next snapshot programmatically
        w_imrecorder.recording = True
        sleep(0.02)
# Flag shared with the capture thread; flipped on by the first snapshot event.
is_playing = False

# Once prediction has started, hide the image-recorder widget for faster fps.
def start(_):
    """Observer callback: launch the background prediction thread exactly once."""
    global is_playing
    if is_playing:
        # Already running — ignore subsequent snapshot events.
        return
    w_imrecorder.layout.display = "none"
    is_playing = True
    Thread(target=predict_webcam_frames).start()


# Trigger `start` when the recorder captures its first image, then display the UI.
w_imrecorder.image.observe(start, "value")
HBox([w_cam, w_imrecorder, w_text])
# Teardown: stop the capture loop and dispose of all widgets.
is_playing = False
Widget.close_all()